با توجه به داده بانک جهانی به سوالات زیر پاسخ دهید. برای استفاده از داده از سه فایل زیر استفاده نمایید. داده نام کشورها: WDICountry داده نام سری های زمانی: WDISeries داده کل: WDIData در صورتی که داده را در اختیار ندارید می توانید از بسته WDI استفاده نموده و داده های مورد نظر را استخراج نمایید.

در اینجا پردازش های اولیه و توابع پراستفاده را می نویسیم.

library(ggplot2)
library(ggthemes)
library(tidyr)
library(stringr)
library(WDI)
library(readr)
library(plyr)
library(ggbiplot)
library(dplyr)
library(highcharter)
#setwd("Desktop/96-97-2/Data Analysis/HW/HW10/")
read_csv("../WDI_csv/WDISeries.csv") -> WDISeries
read_csv("../WDI_csv/WDIData.csv") -> WDIData

get_indicator_gathered <- function(indicator, years = 1960:2017, Country_Code = NULL){
  WDIData %>%
    filter(`Indicator Code` == indicator) %>% 
    select(-X63) -> original
  
  if(!is.null(Country_Code)){
    original %>% 
      filter(`Country Code` %in%  Country_Code) -> original
  }
  
  original %>%
    gather("year", "value", 5:62) %>%
    select(Country = `Country Name`, Country_Code = `Country Code`,Indicator = `Indicator Code`, year, value) %>% 
    mutate(value = as.numeric(value)) %>% 
    filter(year %in% years) -> gathered
  return(list(original = original,
              gathered = gathered,
              na_count = gathered %>% 
                group_by(year) %>%
                summarize(na_count = sum(is.na(value))) %>% 
                ungroup() %>% 
                arrange(desc(year)),
              most_recent = gathered %>% 
                filter(!is.na(value)) %>%
                group_by(Country, Country_Code) %>% 
                top_n(wt = year, n = 1) %>% 
                ungroup()))
}

get_multiple_indicator_gathered <- function(indicators, years = 1960:2017, Country_Code = NULL){
  WDIData %>%
    filter(`Indicator Code` %in%  indicators) %>% 
    select(-X63) -> original
  
  if(!is.null(Country_Code)){
    original %>% 
      filter(`Country Code` %in%  Country_Code) -> original
  }
  
  original %>%
    gather("year", "value", 5:62) %>%
    select(Country = `Country Name`, Country_Code = `Country Code`,Indicator = `Indicator Code`, year, value) %>% 
    mutate(value = as.numeric(value)) %>% 
    filter(year %in% years) -> gathered
  return(list(original = original,
              gathered = gathered,
              na_count = gathered %>% 
                group_by(year, Indicator) %>%
                summarize(na_count = sum(is.na(value))) %>% 
                ungroup() %>% 
                arrange(desc(year)),
              most_recent = gathered %>% 
                filter(!is.na(value)) %>%
                group_by(Country, Country_Code, Indicator) %>% 
                top_n(wt = year, n = 1) %>% 
                ungroup()))
}

cor_tester <- function(indicator1, indicator2, Xpre = 2, Ypre = 2, years = NULL){
  WDISeries %>%
    filter(`Series Code` == indicator1) %>% 
    select(`Indicator Name`) %>% as.data.frame() %>% .[1,1]-> full_name_1
  
  str_sub(full_name_1, end = str_locate(full_name_1, "\\(") -2) %>% .[1]-> name_1
  
  WDISeries %>%
    filter(`Series Code` == indicator2) %>% 
    select(`Indicator Name`) %>% as.data.frame() %>% .[1,1] -> full_name_2
  
  str_sub(full_name_2, end = str_locate(full_name_2, "\\(")- 2) %>% .[1] -> name_2
  if(is.null(years)){
    get_indicator_gathered(indicator1)$most_recent %>% select(-Country) %>%
      right_join(get_indicator_gathered(indicator2)$most_recent,
                 by = c("Country_Code")) %>%
      mutate(Xyear= year.x,
             Xval = round(value.x,Xpre),
             Yyear = year.y,
             Yval = round(value.y,Ypre)) %>%
      drop_na() -> Y_X
  } else {
    get_indicator_gathered(indicator1)$gathered %>% select(-Country) %>%
      inner_join(get_indicator_gathered(indicator2)$gathered,
                 by = c("Country_Code", "year")) %>%
      mutate(Xyear= year,
             Xval = round(value.x,Xpre),
             Yyear = year,
             Yval = round(value.y,Ypre)) %>%
      drop_na() -> Y_X
  }
  
  return(list(full_name_1 = full_name_1,
              full_name_2 = full_name_2,
              name_1 = name_1,
              name_2 = name_2,
              cor_test = cor.test(Y_X$Yval, Y_X$Xval, method = "spearman"),
              plot = Y_X %>% 
                hchart(
                type = "scatter",
                hcaes(x = Xval, y = Yval, color = Country),
                name = paste0(name_2, " / ",name_1),
                tooltip = list(pointFormat =paste0("<b> {point.Country} <b/> <br/>",
                               full_name_1 , " ({point.Xyear}): {point.Xval} <br/>",
                               full_name_2 , " ({point.Yyear}): {point.Yval} <br/>"))
                ) %>%
                hc_title(text = paste0(name_2, " / ",name_1)) %>%
                hc_xAxis(title = list(text = name_1)) %>%
                hc_yAxis(title = list(text = name_2))  %>%
                hc_tooltip(crosshairs = T) %>%
                hc_add_theme(hc_theme_sandsignika())
                ))
}

۱. ده کشور فقیر دنیا را بیابید. نمودار درآمد روزانه آنها را رسم کنید. چند درصد از اعضای این کشورها زیر خط فقر هستند؟ متوسط عمر در این کشورها چقدر است؟

می توانید مقدار مربوط به هر قسمت را با بردن موس روی قسمت مورد نظر ببینید.

get_indicator_gathered("NY.GDP.PCAP.PP.CD")$most_recent %>%
  arrange(value) %>% head(10) -> poorest

poorest %>% mutate(daily = round(value / 365, 2)) %>% hchart(
  type = "bar",
  hcaes(x = Country, y = daily),
  tooltip = list(pointFormat =
                   "Daily Income per Person: {point.daily}$<br/>")
) %>%
  hc_title(text = "Poorest") %>%
  hc_xAxis(title = list(text = "Country")) %>%
  hc_yAxis(title = list(text = "Daily Income"))  %>%
  hc_tooltip(crosshairs = T) %>%
  hc_add_theme(hc_theme_sandsignika())
get_indicator_gathered("SI.POV.NAHC")$most_recent %>%
  select(-Country) %>%
  right_join(poorest %>% head(10), by = c("Country_Code")) %>%
  mutate(daily = round(value.y / 365, 2), UnderPovertyLine = value.x, uplyear =year.x, pvyear = year.y) %>%
  hchart(
    type = "bar",
    hcaes(x = Country, y = UnderPovertyLine),
    tooltip = list(pointFormat =
                     "Daily Income per Person ({point.pvyear}): {point.daily}$ <br/> 
                      Under Poverty line Percent ({point.uplyear}): {point.UnderPovertyLine}%<br/>")
  ) %>%
  hc_title(text = "Poorest (Under Poverty Line Percent)") %>%
  hc_xAxis(title = list(text = "Country")) %>%
  hc_yAxis(title = list(text = "Percent"))  %>%
  hc_tooltip(crosshairs = T) %>%
  hc_add_theme(hc_theme_sandsignika())
get_indicator_gathered("SP.DYN.LE00.IN")$most_recent %>% select(-Country) %>%
  right_join(poorest %>% head(10), by = c("Country_Code")) %>%
  mutate(daily = round(value.y / 365, 2), LE = value.x, LEyear =year.x, pvyear = year.y) %>%
  hchart(
    type = "bar",
    hcaes(x = Country, y = LE),
    tooltip = list(pointFormat =
                     "Daily Income per Person ({point.pvyear}): {point.daily}$ <br/> 
                   Life Expectancy ({point.LEyear}): {point.LE} years<br/>")
    ) %>%
  hc_title(text = "Poorest (Life Expectancy)") %>%
  hc_xAxis(title = list(text = "Country")) %>%
  hc_yAxis(title = list(text = "years"))  %>%
  hc_tooltip(crosshairs = T) %>%
  hc_add_theme(hc_theme_sandsignika())

۲. تراژدی روآندا: بر اساس داده های امید به زندگی ابتدا نمودار سالانه نمودار جعبه ایی امید به زندگی کشورها را رسم نمایید(در یک نمودار!). سپس سری زمانی امید به زندگی روآندا را به آن اضافه کنید. چه می بینید؟ چند میلیون نفر کشته شدند؟

ggplot(get_indicator_gathered("SP.DYN.LE00.IN")$gathered, aes(x = year, y = value)) +
  geom_boxplot(fill = "3")+
  geom_line(data = get_indicator_gathered("SP.DYN.LE00.IN")$gathered %>%
              filter(Country_Code == "RWA"), aes(x = year, y = value, group ="1"), color = "4")+
  xlab("Year") +
  ylab("Years") +
  ggtitle("Life Expectancy")+
  theme(axis.text.x = element_text(angle = 45, vjust = 1, 
                                   size = 8, hjust = 1),
        axis.text.y = element_text(angle = 0, vjust = 1, 
                                   size = 8, hjust = 1))

## From Wikipedia: Death toll is about 500k to 1m

۳. نمودار امید به زندگی و هزینه های بهداشتی را رسم کنید. چه نتیجه ایی می گیرید؟

get_indicator_gathered("SP.DYN.LE00.IN")$most_recent %>% select(-Country) %>%
  right_join(get_indicator_gathered("SH.XPD.CHEX.PP.CD")$most_recent,
             by = c("Country_Code")) %>% 
mutate(LEyear = year.x, LE = round(value.x,2), HEyear = year.y, HE = round(value.y,2)) -> LE_HE

LE_HE %>% 
hchart(
  type = "scatter",
  hcaes(x = HE, y = LE),
  name = "Health Expenditture per Capita / Life Expectancy",
  tooltip = list(pointFormat ="<b> {point.Country} <b/> <br/>
                 Life Expectancy ({point.LEyear}): {point.LE} years<br/>
                 Health Expenditture per Capita ({point.HEyear}): {point.HE}$<br/>")
  ) %>%
  hc_title(text = "Life Expectancy / Health Expenditture per Capita") %>%
  hc_xAxis(title = list(text = "Health Expenditure")) %>%
  hc_yAxis(title = list(text = "Life Expectancy"))  %>%
  hc_tooltip(crosshairs = T) %>%
  hc_add_theme(hc_theme_sandsignika())
LE_HE %>% 
  hchart(
    type = "scatter",
    hcaes(x = round(log(HE),2), y = LE),
    name = "Logarithm of Health Expenditture per Capita / Life Expectancy",
    tooltip = list(pointFormat ="<b> {point.Country} <b/> <br/>
                   Life Expectancy ({point.LEyear}): {point.LE} years<br/>
                   Logarithm of Health Expenditture per Capita ({point.HEyear}): {point.HE} log($)<br/>")
    ) %>%
  hc_title(text = "Life Expectancy / Logarithm of Health Expenditture per Capita") %>%
  hc_xAxis(title = list(text = "Logarithm of Health Expenditure")) %>%
  hc_yAxis(title = list(text = "Life Expectancy"))  %>%
  hc_tooltip(crosshairs = T) %>%
  hc_add_theme(hc_theme_sandsignika())
cor.test(log(LE_HE$HE), LE_HE$LE)
## 
##  Pearson's product-moment correlation
## 
## data:  log(LE_HE$HE) and LE_HE$LE
## t = 23.278, df = 229, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.7954079 0.8730079
## sample estimates:
##       cor 
## 0.8384068
### Life Expectancy andLogarithm of Health Expenditture per Capita have linear relation

۴. آیا قدرت خرید خانواده های ایرانی در ۵۰ سال اخیر افزایش یافته است؟ برای این کار از داده های اقتصادی خانوار استفاده کنید.

بله ۳ برابر شده است.

# Household final consumption expenditure per capita (constant 2010 US$)
get_indicator_gathered("NE.CON.PRVT.PC.KD", Country_Code = c("IRN"))$gathered -> iran_cfcepc
iran_cfcepc %>% 
  mutate(value = round(value, 2)) %>% 
  hchart(
  type = "line",
  hcaes(x = year, y = value),
  tooltip = list(pointFormat ="Household expenditure per capita: {point.value}  (constant 2010 US$)<br/>")
  ) %>%
  hc_title(text = "Iran's Household final consumption expenditure per capita") %>%
  hc_xAxis(title = list(text = "Year")) %>%
  hc_yAxis(title = list(text = "Household Expenditure per capita (constant 2010 US$)"))  %>%
  hc_tooltip(crosshairs = T) %>%
  hc_add_theme(hc_theme_sandsignika())
iran_cfcepc %>% filter(year == 2016) %>% .$value / iran_cfcepc %>% filter(year == 1966) %>% .$value
## [1] 3.130246
### Almost 3 times

۵. رشد اقتصادی ایران را با کشورهای دیگر در طول ۲۰ سال گذشته بر حسب بیست شاخص های اقتصادی مهم مانند تولید ناخالص ملی، تورم و … ارزیابی کنید! (برای هر شاخص از تصویرسازی استفاده کنید.)

۲۰ شاخص اقتصادی خوب را برحسب بیشینه ی موجود بودن آن ها جدا می کنیم.

خواهیم دید که وضعیت ایران در اکثر شاخص های اقتصادی با جهان متفاوت است و در اکثر موارد خیلی نزدیک به سایر جهان نیست.

WDIData %>%
  gather("year", "value", 5:62) %>%
  select(Country = `Country Name`, Country_Code = `Country Code`,Indicator = `Indicator Code`, year, value) %>% 
  filter(Country_Code == "IRN") %>%
  group_by(Indicator) %>% 
  summarize(how_good = sum(!is.na(value)), has_20_years = sum(!is.na(value[year <= 2016 & year >= 1997]))) %>%
  ungroup() %>% 
  left_join(WDISeries, by = c("Indicator" ="Series Code")) %>% 
  .[,1:7] %>% 
  filter(str_detect(`Indicator Name`, "(annual %)")) %>%
  filter(!str_detect(`Indicator Name`, "per")) %>% 
  arrange(desc(has_20_years),desc(how_good)) %>%
  select(Indicator, `Indicator Name`) -> selected_indicators

#selected_indicators %>% View()

selected_indicators$Indicator[c(-11)] %>% .[1:20]-> economic_indicators

#economic_indicators
## "NY.GDP.MKTP.KD.ZG" : GDP annual growth
## ATTENTION: (annual % growth) 
## Economic Growth is NY.GDP.MKTP.KD.ZG
## Economic growth is the increase in the inflation-adjusted market value of the
## goods and services produced by an economy over time.
## It is conventionally measured as the percent rate of increase in real gross domestic product,
## or real GDP

# c("FP.CPI.TOTL",
# "BX.GRT.EXTA.CD.WD",
# "DT.ODA.ALLD.KD",
# "FP.CPI.TOTL.ZG",
# "NE.CON.GOVT.KD",
# "NE.DAB.TOTL.KD",
# "NE.CON.TOTL.KD",
# "NE.EXP.GNFS.KD",
# "NE.GDI.FTOT.KD",
# "NE.IMP.GNFS.KD",
# "NV.AGR.TOTL.KD",
# "NV.IND.MANF.KD",
# "NV.IND.TOTL.KD",
# "NV.SRV.TETC.KD",
# "NY.GDP.FCST.KD",
# "NY.GDP.MKTP.KD",
# "NY.GNP.MKTP.KD.ZG",
# "FS.AST.PRVT.GD.ZS",
# "NY.ADJ.NNTY.PC.KD.ZG",
# "NE.GDI.TOTL.KD.ZG") -> economic_indicators

plots <- list()
for(i in economic_indicators){
  indicator_name <- WDISeries %>% filter(`Series Code` == i) %>% .$`Indicator Name`
  get_indicator_gathered(i)$gathered %>% .$value %>% na.omit() -> dfry 
  ylim <- boxplot.stats(dfry)$stats[c(1, 5)]
  ggplot(get_indicator_gathered(i)$gathered, aes(x = year, y = value)) +
    geom_boxplot(fill = "3", outlier.shape = NA)+
    coord_cartesian(ylim = quantile(dfry, c(0.1, 0.9)))+
    geom_line(data = get_indicator_gathered(i)$gathered %>%
                filter(Country_Code == "IRN"), aes(x = year, y = value, group ="1"), color = "4")+
    xlab("Year") +
    ggtitle(indicator_name)+
    ggpubr::theme_classic2() +
    theme(axis.text.x = element_text(angle = 45, vjust = 1, 
                                     size = 5, hjust = 1),
          axis.text.y = element_text(angle = 0, vjust = 1, 
                                     size = 5, hjust = 1),
          title = element_text(angle = 0, vjust = 1, 
                               size = 6, hjust = 1)) -> plots[[i]]
}
ggpubr::ggarrange(plotlist = plots, ncol = 5, nrow = 4)

۶. در قسمت قبل با استفاده از روش خوشه بندی k-means داده ها را به سه دسته تقسیم کنید. ایران در کدام دسته می گنجد؟ (پیش از خوشه بندی طبیعتا داده را باید پاکسازی و استاندارد سازی نمایید.)

get_multiple_indicator_gathered(indicators = economic_indicators)$most_recent %>% 
  select(-year) %>%
  spread(key = Indicator, value = value) %>% 
  select(-Country_Code) %>%
  drop_na() -> mymatrix

set.seed(400)
kcl = kmeans(mymatrix[,-1] %>% scale(), centers = 3)
mymatrix$Cluster <- as.factor(kcl$cluster)
mymatrix %>% select(Country, Cluster, NY.GNP.MKTP.KD.ZG ) -> clusters

## clusters %>% View()

for(i in 1:3) {
  clusterCountries = clusters[clusters$Cluster == i,]$Country
  cat("\n\nCluster No.", i, ":\n")
  cat(clusterCountries, sep = ", ")
}
## 
## 
## Cluster No. 1 :
## Afghanistan, Bahrain, Bangladesh, Benin, Bhutan, Bolivia, Botswana, Burkina Faso, Cambodia, Cameroon, Congo, Dem. Rep., Cote d'Ivoire, Dominican Republic, Egypt, Arab Rep., Ethiopia, Ghana, Guatemala, Honduras, Iceland, India, Indonesia, Iran, Islamic Rep., Israel, Jordan, Kenya, Kuwait, Kyrgyz Republic, Lao PDR, Lesotho, Madagascar, Malawi, Mozambique, New Zealand, Nicaragua, Niger, Oman, Pakistan, Papua New Guinea, Philippines, Rwanda, Senegal, Sierra Leone, Sudan, Tajikistan, Tanzania, Togo, Uganda, United Arab Emirates, Vanuatu, Vietnam
## 
## Cluster No. 2 :
## Argentina, Belarus, Brazil, Burundi, Chad, Congo, Rep., Ecuador, Equatorial Guinea, Gambia, The, Liberia, Nepal, Nigeria, Saudi Arabia, Venezuela, RB, Zimbabwe
## 
## Cluster No. 3 :
## Albania, Armenia, Australia, Azerbaijan, Bahamas, The, Belize, Bosnia and Herzegovina, Brunei Darussalam, Bulgaria, Cabo Verde, Canada, Central African Republic, Chile, Colombia, Comoros, Costa Rica, Croatia, Czech Republic, Denmark, El Salvador, Gabon, Georgia, Haiti, Hungary, Jamaica, Japan, Kazakhstan, Korea, Rep., Lebanon, Macedonia, FYR, Malaysia, Mauritania, Mauritius, Mexico, Moldova, Mongolia, Montenegro, Morocco, Namibia, Norway, Panama, Paraguay, Peru, Poland, Romania, Russian Federation, Serbia, Seychelles, South Africa, Sri Lanka, Swaziland, Sweden, Switzerland, Syrian Arab Republic, Thailand, Trinidad and Tobago, Tunisia, Turkey, Ukraine, United Kingdom, United States, Uruguay
iran_clus = clusters$Cluster[clusters$Country == 'Iran, Islamic Rep.']
cat('Iran\'s cluster:', iran_clus)
## Iran's cluster: 1

۷. به وسیله تحلیل مولفه اصلی بعد داده رو به دو کاهش دهید سپس خوشه های به دست آمده در قسمت قبل را بر روی آن نمایش دهید. آیا عملکرد روش خوشه بندی شما مطلوب بوده است؟

بله مطلوب بوده است و به طور کلی اقتصاد های قوی تر در دسته ی سوم و اقتصاد های ضعیف تر در دسته ی اول و دوم هستند.

pca = prcomp(mymatrix %>% select(-Country, -Cluster), center = T, scale. = T)

ggbiplot::ggbiplot(pca, 1:2, obs.scale = 1, var.scale = 1, groups =clusters$Cluster,
                   labels = clusters$Country, ellipse = TRUE, circle = TRUE)+
  guides(color = F)


۸. با استفاده از داده روشی برای پیش بینی رشد اقتصادی ایران در سال آینده ارائه دهید.

get_multiple_indicator_gathered(indicators = economic_indicators, Country_Code = "IRN")$gathered %>% 
  spread(key = Indicator, value = value) %>% 
  select(-Country_Code, -Country) %>% 
  arrange(year) %>% 
  mutate( next_year_growth = lead(NY.GDP.MKTP.KD.ZG)) %>% 
  select(-year) %>% 
  drop_na()-> iran_data

library(h2o)
localH2O = h2o.init()
##  Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         2 hours 3 minutes 
##     H2O cluster timezone:       Asia/Tehran 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.20.0.2 
##     H2O cluster version age:    1 month and 5 days  
##     H2O cluster name:           H2O_started_from_R_Mahbod_jyk252 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   1.61 GB 
##     H2O cluster total cores:    4 
##     H2O cluster allowed cores:  4 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     H2O API Extensions:         XGBoost, Algos, AutoML, Core V3, Core V4 
##     R Version:                  R version 3.5.1 (2018-07-02)
h2o.no_progress()

new_matrix <- iran_data %>% 
  as.h2o()
           
model<- h2o.glm(y = "next_year_growth",
                training_frame = new_matrix, nfolds = 5)
model
## Model Details:
## ==============
## 
## H2ORegressionModel: glm
## Model ID:  GLM_model_R_1532095513044_5 
## GLM Model: summary
##     family     link                              regularization
## 1 gaussian identity Elastic Net (alpha = 0.5, lambda = 0.5999 )
##   number_of_predictors_total number_of_active_predictors
## 1                         20                          13
##   number_of_iterations             training_frame
## 1                    2 file5a6711b9833_sid_9df1_2
## 
## Coefficients: glm coefficients
##               names coefficients standardized_coefficients
## 1         Intercept     7.286518                  3.378958
## 2    FM.LBL.BMNY.ZG    -0.120282                 -0.783543
## 3    FP.CPI.TOTL.ZG     0.000000                  0.000000
## 4 NE.CON.GOVT.KD.ZG     0.299011                  1.453140
## 5 NE.CON.PETC.KD.ZG    -0.066735                 -1.083158
## 
## ---
##                names coefficients standardized_coefficients
## 16 NY.GDP.DEFL.KD.ZG    -0.010634                 -0.100078
## 17 NY.GDP.MKTP.KD.ZG     0.075533                  0.597234
## 18 NY.GNP.MKTP.KD.ZG     0.066657                  0.492607
## 19       SP.POP.GROW     0.000000                  0.000000
## 20    SP.RUR.TOTL.ZG     0.000000                  0.000000
## 21       SP.URB.GROW     0.000000                  0.000000
## 
## H2ORegressionMetrics: glm
## ** Reported on training data. **
## 
## MSE:  37.73331
## RMSE:  6.142745
## MAE:  4.499093
## RMSLE:  NaN
## Mean Residual Deviance :  37.73331
## R^2 :  0.3948841
## Null Deviance :1933.072
## Null D.o.F. :30
## Residual Deviance :1169.733
## Residual D.o.F. :17
## AIC :230.521
## 
## 
## 
## H2ORegressionMetrics: glm
## ** Reported on cross-validation data. **
## ** 5-fold cross-validation on training data (Metrics computed for combined holdout predictions) **
## 
## MSE:  183.6414
## RMSE:  13.55143
## MAE:  8.284289
## RMSLE:  NaN
## Mean Residual Deviance :  183.6414
## R^2 :  -1.944993
## Null Deviance :2120.509
## Null D.o.F. :30
## Residual Deviance :5692.883
## Residual D.o.F. :14
## AIC :285.5767
## 
## 
## Cross-Validation Metrics Summary: 
##                              mean        sd cv_1_valid cv_2_valid
## mae                     8.5776415  2.336261   5.354284   7.193151
## mean_residual_deviance  187.81822 148.41985  40.011875 110.701546
## mse                     187.81822 148.41985  40.011875 110.701546
## null_deviance           424.10184 194.30229  264.17218  595.43085
## r2                     -1.7347221 0.7817357 -0.2839509 -1.8435653
## residual_deviance       1138.5767  898.6093  280.08313   996.3139
## rmse                    12.111865  4.534366  6.3254943  10.521481
## rmsle                         0.0       NaN        NaN        NaN
##                        cv_3_valid cv_4_valid cv_5_valid
## mae                      9.150004  14.706596   6.484171
## mean_residual_deviance   104.5933   604.6533   79.13103
## mse                      104.5933   604.6533   79.13103
## null_deviance           143.40692   880.8932  236.60599
## r2                     -1.7050694 -3.6560342 -1.1849911
## residual_deviance        313.7799    3627.92  474.78616
## rmse                    10.227087  24.589699   8.895562
## rmsle                         NaN        NaN        NaN
h2o.mse(model)
## [1] 37.73331

۹. سوالهای ۵ تا ۷ را ابتدا برای ۲۰ شاخص سلامت سپس بر حسب ۲۰ شاخص آموزشی تکرار کنید.

WDIData %>%
  gather("year", "value", 5:62) %>%
  select(Country = `Country Name`, Country_Code = `Country Code`,Indicator = `Indicator Code`, year, value) %>%
  filter(Country_Code == "IRN") %>%
  group_by(Indicator) %>%
  summarize(how_good = sum(!is.na(value)), has_20_years = sum(!is.na(value[year <= 2016 & year >= 1997]))) %>%
  ungroup() %>% 
  left_join(WDISeries, by = c("Indicator" ="Series Code")) %>%
  .[,1:7] %>%
  filter(str_detect(Topic, "Health")) %>% 
  filter(has_20_years == 20) %>% 
  filter(!str_detect(Indicator, "SP.POP")) %>% 
  filter(Indicator != "SH.VAC.TTNS.ZS") %>% 
  arrange(desc(has_20_years),desc(how_good)) %>% 
  .$Indicator %>% 
  .[1:20]-> health_indicators


plots <- list()
for(i in health_indicators){
  indicator_name <- WDISeries %>% filter(`Series Code` == i) %>% .$`Indicator Name`
  get_indicator_gathered(i)$gathered %>% .$value %>% na.omit() -> dfry 
  ylim <- boxplot.stats(dfry)$stats[c(1, 5)]
  ggplot(get_indicator_gathered(i)$gathered, aes(x = year, y = value)) +
    geom_boxplot(fill = "3", outlier.shape = NA)+
    coord_cartesian(ylim = quantile(dfry, c(0.1, 0.9)))+
    geom_line(data = get_indicator_gathered(i)$gathered %>%
                filter(Country_Code == "IRN"), aes(x = year, y = value, group ="1"), color = "4")+
    xlab("Year") +
    ggtitle(indicator_name)+
    ggpubr::theme_classic2() +
    theme(axis.text.x = element_text(angle = 45, vjust = 1, 
                                     size = 5, hjust = 1),
          axis.text.y = element_text(angle = 0, vjust = 1, 
                                     size = 5, hjust = 1),
          title = element_text(angle = 0, vjust = 1, 
                               size = 6, hjust = 1)) -> plots[[i]]
}
ggpubr::ggarrange(plotlist = plots, ncol = 5, nrow = 4)

get_multiple_indicator_gathered(indicators = health_indicators)$most_recent %>% 
  select(-year) %>%
  spread(key = Indicator, value = value) %>% 
  select(-Country_Code) %>% 
  drop_na() -> mymatrix

set.seed(400)
kcl = kmeans(mymatrix[,-1] %>% scale(), centers = 3)
mymatrix$Cluster <- as.factor(kcl$cluster)
mymatrix %>% select(Country, Cluster) -> clusters

## clusters %>% View()

for(i in 1:3) {
  clusterCountries = clusters[clusters$Cluster == i,]$Country
  cat("\n\nCluster No.", i, ":\n")
  cat(clusterCountries, sep = ", ")
}
## 
## 
## Cluster No. 1 :
## Albania, Antigua and Barbuda, Argentina, Armenia, Australia, Austria, Bahamas, The, Bahrain, Barbados, Belarus, Belgium, Bosnia and Herzegovina, Brunei Darussalam, Bulgaria, Canada, Central Europe and the Baltics, Chile, China, Costa Rica, Croatia, Cuba, Cyprus, Czech Republic, Denmark, East Asia & Pacific, East Asia & Pacific (excluding high income), East Asia & Pacific (IDA & IBRD countries), Estonia, Euro area, Europe & Central Asia, European Union, Finland, France, Georgia, Germany, Greece, High income, Hungary, Iceland, Iran, Islamic Rep., Ireland, Israel, Italy, Jamaica, Japan, Korea, Rep., Kuwait, Late-demographic dividend, Latvia, Lebanon, Lithuania, Luxembourg, Macedonia, FYR, Malaysia, Maldives, Malta, Mauritius, Mexico, Montenegro, Netherlands, New Zealand, North America, Norway, OECD members, Oman, Poland, Portugal, Post-demographic dividend, Qatar, Romania, Serbia, Singapore, Slovak Republic, Slovenia, Spain, Sri Lanka, St. Lucia, Sweden, Switzerland, Thailand, Tunisia, Turkey, United Arab Emirates, United Kingdom, United States, Upper middle income, Uruguay, Vietnam
## 
## Cluster No. 2 :
## Algeria, Arab World, Azerbaijan, Bangladesh, Belize, Bhutan, Bolivia, Botswana, Brazil, Cabo Verde, Cambodia, Caribbean small states, Colombia, Dominican Republic, Early-demographic dividend, Ecuador, Egypt, Arab Rep., El Salvador, Europe & Central Asia (excluding high income), Europe & Central Asia (IDA & IBRD countries), Fiji, Grenada, Guatemala, Guyana, Honduras, IBRD only, IDA & IBRD total, India, Indonesia, Iraq, Jordan, Kazakhstan, Kenya, Kiribati, Korea, Dem. People’s Rep., Kyrgyz Republic, Lao PDR, Latin America & Caribbean, Latin America & Caribbean (excluding high income), Latin America & the Caribbean (IDA & IBRD countries), Libya, Low & middle income, Lower middle income, Micronesia, Fed. Sts., Middle East & North Africa, Middle East & North Africa (excluding high income), Middle East & North Africa (IDA & IBRD countries), Middle income, Moldova, Mongolia, Morocco, Myanmar, Nepal, Nicaragua, Other small states, Pacific island small states, Panama, Paraguay, Peru, Philippines, Russian Federation, Rwanda, Samoa, Sao Tome and Principe, Saudi Arabia, Senegal, Seychelles, Small states, Solomon Islands, South Asia, South Asia (IDA & IBRD), St. Vincent and the Grenadines, Suriname, Syrian Arab Republic, Tajikistan, Timor-Leste, Tonga, Trinidad and Tobago, Turkmenistan, Ukraine, Uzbekistan, Vanuatu, Venezuela, RB, World
## 
## Cluster No. 3 :
## Afghanistan, Angola, Benin, Burkina Faso, Burundi, Cameroon, Central African Republic, Chad, Comoros, Congo, Dem. Rep., Congo, Rep., Cote d'Ivoire, Djibouti, Equatorial Guinea, Eritrea, Ethiopia, Fragile and conflict affected situations, Gabon, Gambia, The, Ghana, Guinea, Guinea-Bissau, Haiti, Heavily indebted poor countries (HIPC), IDA blend, IDA only, IDA total, Least developed countries: UN classification, Lesotho, Liberia, Low income, Madagascar, Malawi, Mali, Mauritania, Mozambique, Namibia, Niger, Nigeria, Pakistan, Papua New Guinea, Pre-demographic dividend, Sierra Leone, Somalia, South Africa, South Sudan, Sub-Saharan Africa, Sub-Saharan Africa (excluding high income), Sub-Saharan Africa (IDA & IBRD countries), Sudan, Swaziland, Tanzania, Togo, Uganda, Yemen, Rep., Zambia, Zimbabwe
iran_clus = clusters$Cluster[clusters$Country == 'Iran, Islamic Rep.']
cat('Iran\'s cluster:', iran_clus)
## Iran's cluster: 1
pca = prcomp(mymatrix %>% select(-Country, -Cluster), center = T, scale. = T)

ggbiplot(pca, 1:2, obs.scale = 1, var.scale = 1, groups =clusters$Cluster,
                   labels = clusters$Country, ellipse = TRUE, circle = TRUE)+
  guides(color = F)

بله مطلوب بوده است و به طور کلی کشورهای با بهداشت بهتر در دسته ی دوم کشورهای با بهداشت معمولی در دسته ی سوم و کشورهای با بهداشت ضعیف تر در دسته ی اول هستند.

WDIData %>%
  gather("year", "value", 5:62) %>%
  select(Country = `Country Name`, Country_Code = `Country Code`,Indicator = `Indicator Code`, year, value) %>%
  filter(Country_Code == "IRN") %>%
  group_by(Indicator) %>%
  summarize(how_good = sum(!is.na(value)), has_20_years = sum(!is.na(value[year <= 2016 & year >= 1997]))) %>%
  ungroup() %>% 
  left_join(WDISeries, by = c("Indicator" ="Series Code")) %>%
  .[,1:7] %>%
  filter(str_detect(Topic, "Education")) %>% 
  #filter(has_20_years == 20) %>% 
  arrange(desc(has_20_years),desc(how_good)) %>%
  .$Indicator %>% 
  .[1:20]-> edu_indicators


plots <- list()
for(i in edu_indicators){
  indicator_name <- WDISeries %>% filter(`Series Code` == i) %>% .$`Indicator Name`
  get_indicator_gathered(i)$gathered %>% .$value %>% na.omit() -> dfry 
  ylim <- boxplot.stats(dfry)$stats[c(1, 5)]
  ggplot(get_indicator_gathered(i)$gathered, aes(x = year, y = value)) +
    geom_boxplot(fill = "3", outlier.shape = NA)+
    coord_cartesian(ylim = quantile(dfry, c(0.1, 0.9)))+
    geom_line(data = get_indicator_gathered(i)$gathered %>%
                filter(Country_Code == "IRN"), aes(x = year, y = value, group ="1"), color = "4")+
    xlab("Year") +
    ggtitle(indicator_name)+
    ggpubr::theme_classic2() +
    theme(axis.text.x = element_text(angle = 45, vjust = 1, 
                                     size = 5, hjust = 1),
          axis.text.y = element_text(angle = 0, vjust = 1, 
                                     size = 5, hjust = 1),
          title = element_text(angle = 0, vjust = 1, 
                               size = 6, hjust = 1)) -> plots[[i]]
}
ggpubr::ggarrange(plotlist = plots, ncol = 5, nrow = 4)

get_multiple_indicator_gathered(indicators = edu_indicators)$most_recent %>% 
  select(-year) %>%
  spread(key = Indicator, value = value) %>% 
  select(-Country_Code) %>%
  drop_na() -> mymatrix

set.seed(400)
kcl = kmeans(mymatrix[,-1] %>% scale(), centers = 3)
mymatrix$Cluster <- as.factor(kcl$cluster)
mymatrix %>% select(Country, Cluster) -> clusters

## clusters %>% View()

for(i in 1:3) {
  clusterCountries = clusters[clusters$Cluster == i,]$Country
  cat("\n\nCluster No.", i, ":\n")
  cat(clusterCountries, sep = ", ")
}
## 
## 
## Cluster No. 1 :
## Afghanistan, Angola, Central African Republic, Chad, Cote d'Ivoire, Djibouti, Equatorial Guinea, Eritrea, Guinea, Guinea-Bissau, Iraq, Mali, Pakistan, South Sudan, Togo, Yemen, Rep.
## 
## Cluster No. 2 :
## Albania, Algeria, Andorra, Antigua and Barbuda, Argentina, Armenia, Aruba, Australia, Austria, Barbados, Belarus, Belgium, Bermuda, Bolivia, Brazil, Brunei Darussalam, Bulgaria, Cabo Verde, Canada, Chile, Costa Rica, Croatia, Cuba, Cyprus, Czech Republic, Denmark, Dominica, Ecuador, El Salvador, Estonia, Finland, France, Germany, Ghana, Grenada, Guyana, Hong Kong SAR, China, Hungary, Iceland, Ireland, Israel, Italy, Jamaica, Japan, Kenya, Korea, Rep., Kuwait, Latvia, Lebanon, Liberia, Liechtenstein, Lithuania, Luxembourg, Macao SAR, China, Malaysia, Malta, Marshall Islands, Mauritius, Mexico, Moldova, Mongolia, Netherlands, New Zealand, Norway, Palau, Peru, Philippines, Portugal, Romania, Russian Federation, San Marino, Seychelles, Slovak Republic, Slovenia, South Africa, Spain, Sri Lanka, St. Lucia, St. Vincent and the Grenadines, Sweden, Switzerland, Thailand, Trinidad and Tobago, Turkmenistan, Tuvalu, Ukraine, United Arab Emirates, United Kingdom, United States, Uruguay, Venezuela, RB, Vietnam, West Bank and Gaza
## 
## Cluster No. 3 :
## Azerbaijan, Bahamas, The, Bahrain, Bangladesh, Belize, Benin, Burkina Faso, Cameroon, China, Colombia, Comoros, Congo, Dem. Rep., Congo, Rep., Dominican Republic, Egypt, Arab Rep., Ethiopia, Gabon, Gambia, The, Georgia, Greece, Guatemala, Honduras, India, Indonesia, Iran, Islamic Rep., Jordan, Kazakhstan, Kyrgyz Republic, Lao PDR, Lesotho, Libya, Macedonia, FYR, Madagascar, Malawi, Mauritania, Morocco, Myanmar, Namibia, Nicaragua, Nigeria, Panama, Paraguay, Poland, Puerto Rico, Qatar, Rwanda, Samoa, Sao Tome and Principe, Saudi Arabia, Senegal, Serbia, Sierra Leone, Sudan, Swaziland, Syrian Arab Republic, Tajikistan, Tanzania, Timor-Leste, Tonga, Tunisia, Turkey, Uganda, Zimbabwe
iran_clus = clusters$Cluster[clusters$Country == 'Iran, Islamic Rep.']
cat('Iran\'s cluster:', iran_clus)
## Iran's cluster: 3
pca = prcomp(mymatrix %>% select(-Country, -Cluster), center = T, scale. = T)

ggbiplot(pca, 1:2, obs.scale = 1, var.scale = 1, groups =clusters$Cluster,
         labels = clusters$Country, ellipse = TRUE, circle = TRUE)+
  guides(color = F)

بله مطلوب بوده است و به طور کلی کشورهای با آموزش بهتر در دسته ی دوم و کشورهای با آموزش متوسط در دسته ی سوم و کشورهای با آموزش ضعیف تر در دسته ی اول هستند.


۱۰. کشورهای دنیا را بر حسب ۶۰ شاخص اقتصادی، سلامت و آموزش با روش سلسله مراتبی خوشه بندی کرده و دندروگرام آن را رسم نمایید. اگر داده ها بر سه دسته تقسیم شوند ایران در کدام دسته می گنجد؟

get_multiple_indicator_gathered(indicators =
                                  c(edu_indicators, health_indicators, economic_indicators))$most_recent %>%
  select(-year) %>%
  spread(key = Indicator, value = value) %>%
  select(-Country_Code) %>%
  drop_na() %>% as.data.frame() -> mymatrix

rownames(mymatrix) <- mymatrix[,1]

mymatrix %>% select(-Country) -> mymatrix
dist = stats::dist(mymatrix, method = "euclidean")
clus = hclust(dist, method = "complete")

library(ggdendro)

ggdendrogram(clus, rotate = FALSE, size = 2) +
  ggtitle("60 Factors Dendrogram")

hcut = cutree(clus, k = 3)


data.frame(
  Country = rownames(mymatrix),
  Cluster = hcut,
  stringsAsFactors = F
) -> clus_result

for (i in clus_result$Cluster %>% unique()) {
  clusterCountries = clus_result[clus_result$Cluster == i, ]$Country
  cat("\n\nCluster No.", i, ":\n")
  cat(clusterCountries, sep = ", ")
}
## 
## 
## Cluster No. 1 :
## Afghanistan, Albania, Argentina, Armenia, Australia, Azerbaijan, Bahamas, The, Bahrain, Belarus, Belize, Benin, Bolivia, Brunei Darussalam, Bulgaria, Burkina Faso, Cabo Verde, Cameroon, Canada, Central African Republic, Chad, Chile, Colombia, Comoros, Congo, Dem. Rep., Congo, Rep., Costa Rica, Cote d'Ivoire, Croatia, Czech Republic, Denmark, Dominican Republic, Ecuador, Egypt, Arab Rep., El Salvador, Equatorial Guinea, Ethiopia, Gabon, Gambia, The, Georgia, Ghana, Guatemala, Honduras, Hungary, Iceland, Iran, Islamic Rep., Israel, Jamaica, Japan, Jordan, Kazakhstan, Kenya, Korea, Rep., Kuwait, Kyrgyz Republic, Lao PDR, Lebanon, Lesotho, Liberia, Macedonia, FYR, Madagascar, Malawi, Malaysia, Mauritania, Mauritius, Mexico, Moldova, Mongolia, Morocco, Namibia, New Zealand, Nicaragua, Norway, Panama, Paraguay, Peru, Philippines, Poland, Romania, Russian Federation, Rwanda, Saudi Arabia, Senegal, Serbia, Seychelles, Sierra Leone, South Africa, Sri Lanka, Sudan, Swaziland, Sweden, Switzerland, Syrian Arab Republic, Tajikistan, Tanzania, Thailand, Togo, Trinidad and Tobago, Tunisia, Turkey, Uganda, Ukraine, United Arab Emirates, United Kingdom, Uruguay, Venezuela, RB, Vietnam, Zimbabwe
## 
## Cluster No. 2 :
## Bangladesh, Brazil, Indonesia, Nigeria, Pakistan, United States
## 
## Cluster No. 3 :
## India
iran_clus = clus_result$Cluster[clus_result$Country == 'Iran, Islamic Rep.']
cat('Iran\'s cluster:', iran_clus)
## Iran's cluster: 1

۱۱. سه یافته جالب از داده ها استخراج کنید.

یافته ی اول: کریلیشن مثبت درصد جمعیت شهری و امید به زندگی. برخلاف اینکه مردم فکر می کنند روستایی ها طول عمر بیشتری دارند.

یافته ی دوم: کریلیشن منفی درصد زنانی که مدرک لیسانس دارند در جامعه و میانگین تعداد فرزندان زنان در جامعه.

رابطه ی مثبت درصد محققین در جامعه و درآمد آن جامعه.

############ 
### the more the population lives in the city the higher life expectancy is

cor_tester("SP.URB.TOTL.IN.ZS","SP.DYN.LE00.IN", years = c(1960,2017))$cor_test
## 
##  Spearman's rank correlation rho
## 
## data:  Y_X$Yval and Y_X$Xval
## S = 1.0567e+11, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.7518648
### right now
cor_tester("SP.URB.TOTL.IN.ZS","SP.DYN.LE00.IN")$plot
# B
############ 
### Number of Children a woman gives birth to and BS degree
### negative relationship

cor_tester("SE.TER.CUAT.BA.FE.ZS","SP.DYN.TFRT.IN", years = c(1960,2017))$cor_test
## 
##  Spearman's rank correlation rho
## 
## data:  Y_X$Yval and Y_X$Xval
## S = 1634100, p-value = 6.402e-11
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##        rho 
## -0.4523253
### right now
cor_tester("SE.TER.CUAT.BA.FE.ZS","SP.DYN.TFRT.IN")$plot
# C
############ 
### Number of Reasearchers in a million and GDP per Capita relationship
### spearman correlation of 0.8

cor_tester("SP.POP.SCIE.RD.P6","NY.GDP.PCAP.KD", years = c(1960,2017))$cor_test
## 
##  Spearman's rank correlation rho
## 
## data:  Y_X$Yval and Y_X$Xval
## S = 89057000, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.8014651
### right now
cor_tester("SP.POP.SCIE.RD.P6","NY.GDP.PCAP.KD")$plot